HW 03

Author

Weston Scott

1 - Du Bois challenge.

income <- read.csv("data/income.csv")

income <- income |>
    mutate(
        Average_Income = as.integer(Average_Income),
        ClassLabel = factor(
            paste0(Class, " | $", 
                   format(Average_Income, 
                          big.mark = ",")
                  ),
            levels = unique(paste0(Class, " | $", 
                               format(Average_Income, 
                                      big.mark = ",")))
            )
    ) |>

    pivot_longer(
        cols = Rent:Other, ## list slice-like syntax to get the ordered columns
        names_to = "Category", 
        values_to = "Percent"
    ) |>

    mutate(
        Category = factor(Category, 
                          levels = c("Other", 
                                     "Tax", 
                                     "Clothes", 
                                     "Food", 
                                     "Rent")),
        text_color = ifelse(Category == "Rent", 
                            "white", 
                            "black")
    ) |>

    group_by(ClassLabel) |>
    mutate(pos = cumsum(Percent) - Percent / 2) |>
    ungroup() |> glimpse()
Rows: 35
Columns: 7
$ Class          <chr> "$100-200", "$100-200", "$100-200", "$10…
$ Average_Income <int> 139, 139, 139, 139, 139, 249, 249, 249, …
$ ClassLabel     <fct> "$100-200 | $  139", "$100-200 | $  139"…
$ Category       <fct> Rent, Food, Clothes, Tax, Other, Rent, F…
$ Percent        <dbl> 19.0, 43.0, 28.0, 9.9, 0.1, 22.0, 47.0, …
$ text_color     <chr> "white", "black", "black", "black", "bla…
$ pos            <dbl> 9.50, 40.50, 76.00, 94.95, 99.95, 11.00,…
category_colors <- c(
    Rent = "black",
    Food = "slateblue4",
    Clothes = "rosybrown2",
    Tax = "gray60",
    Other = "tan"
)
du_bois <- ggplot(
    income, 
    aes(x = fct_rev(ClassLabel), 
        y = Percent, 
        fill = Category)
    ) +

    geom_col(color = "black", 
             width = 0.7) +

    geom_text(data = filter(income, 
                            Percent > 1),
              aes(label = paste0(round(Percent, 1), "%"), 
                  y = pos, 
                  color = text_color), 
              size = 2.5
    ) +
    scale_fill_manual(values = category_colors) +
    scale_color_manual(values = c("white" = "white", 
                                  "black" = "black")) +

    coord_flip() +
    scale_y_continuous(labels = NULL) +
  
    annotate("text", 
             x = c(1, 2.5, 4.5, 6.5), 
             y = 102, 
             label = c("Well-To-Do", 
                       "Comfortable", 
                       "Fair", 
                       "Poor"), 
             size = 2.5, 
             angle = 90) +
  
  labs(
      x = NULL, 
      y = NULL, 
      title = "Annual Expenditure For Provided Data",
      text_color = NULL
  ) +

  theme(
      axis.title = element_blank(),
      axis.text.y = element_text(face = "bold", 
                                 size = 8),
      panel.grid = element_blank(),
      legend.title = element_blank(),
      legend.position = "top",
      plot.title = element_text(hjust = 0.5, margin = margin(b = 10))
  ) +

  guides(fill = guide_legend(reverse = TRUE), 
         color = "none")

ggbackground(du_bois, "images/paper.jpg")

2 - COVID survey - interpret

The first relationship I noticed was the question “Had flu vaccine this year”. I am understanding that the error bar lengths for the “No” response seem much longer than those of the “Yes” response for all top level COVID questions. It would be my assumption that those individuals that do not obtain the flu shot likely follow more information on the subject matter of whether or not the COVID vaccine is safe versus not safe. Those that get the flu shot appear to have responses overall that are more centrally located to the means, telling me that they are either not informed or maybe are simply not as concerned with the situation as compared to those that did not get the flu shot.

Example 2

Looking at the “I trust information that I have received about the vaccines” column has a very small deviation from a localized mean across the board. Every combination appears to have show low confidence, leading to small error bars, with more survey results in the lower values for that question.

Example 3

Something interesting that I think is quite visible with the provided image is that for the entire set of data, there are 2 question columns that tend to have the most diverse results, meaning the widest spread of answers, or the 10th and 90th percentile bars are on average the longest. The questions are “Based on my understanding, I believe the vaccine is safe” and “I am concerned about the safety and side effects of the vaccine.” I would say that these results would directly reflect information (or misinformation) dispersed to the masses. The length of the bars suggest that there are more people on either end of the spectrum for the questions then there are neutral responses.

Example 4

A final observation I made looking at this dataset involved the age demographic against the “Based on my understanding, I believe the vaccine is safe”. There is are large error bars for each age group except the youngest group. I attribute that to simply youth not being as informed relative to the information that is being dispersed. The spread of the survey results for the youth is minimal.

3 - COVID survey - reconstruct

covid_survey <- read.csv("data/covid-survey.csv")

covid_survey <- covid_survey |> 
    row_to_names(row_number = 1) |>
    clean_names() |>
    mutate(
        across(everything(), 
               ~ na_if(., ""))
    ) |>

    filter(
        if_any(-response_id, 
               ~ !is.na(.))
    )
covid_survey <- covid_survey |>
    mutate(
        exp_already_vax = recode(exp_already_vax,
                                 "1" ="Yes", 
                                 "0" = "No"),
        exp_flu_vax = recode(exp_flu_vax,
                             "1" ="Yes", 
                             "0" = "No"),
        exp_profession = recode(exp_profession,
                                "1" = "Nursing", 
                                "0" = "Medical"),
        exp_gender = recode(exp_gender,
                            "0" = "Male",
                            "1" = "Female",
                            "3" = "Non-binary/Third gender",
                            "4" = "Prefer not to say"),
        exp_race = recode(exp_race,
                          "1" = "American Indian/Alaskan Native",
                          "2" = "Asian",
                          "3" = "Black/African American",
                          "4" = "Native Hawaiian/Other Pacific Islander",
                          "5" = "White"),
        exp_ethnicity = recode(exp_ethnicity,
                               "1" = "Hispanic/Latino",
                               "2" = "Non-Hispanic/Non-Latino"),
        exp_age_bin = recode(exp_age_bin,
                             "0" = "<20",
                             "20" = "21-25",
                             "25" = "26-30",
                             "30" = ">30")
    
    )
covid_survey_longer <- covid_survey |>
    pivot_longer(
        cols = starts_with("exp_"),
        names_to = "explanatory",
        values_to = "explanatory_value"
    ) |>
    filter(!is.na(explanatory_value)) |>
    pivot_longer(
        cols = starts_with("resp_"),
        names_to = "response",
        values_to = "response_value"
    )

covid_survey_longer
# A tibble: 43,428 × 5
   response_id explanatory    explanatory_value response         
   <chr>       <chr>          <chr>             <chr>            
 1 1           exp_profession Nursing           resp_safety      
 2 1           exp_profession Nursing           resp_confidence_…
 3 1           exp_profession Nursing           resp_concern_saf…
 4 1           exp_profession Nursing           resp_feel_safe_a…
 5 1           exp_profession Nursing           resp_will_recomm…
 6 1           exp_profession Nursing           resp_trust_info  
 7 1           exp_flu_vax    Yes               resp_safety      
 8 1           exp_flu_vax    Yes               resp_confidence_…
 9 1           exp_flu_vax    Yes               resp_concern_saf…
10 1           exp_flu_vax    Yes               resp_feel_safe_a…
# ℹ 43,418 more rows
# ℹ 1 more variable: response_value <chr>
covid_survey_summary_stats_by_group <- covid_survey_longer |>
    group_by(explanatory, explanatory_value, response) |>
    summarize(
        mean = mean(as.numeric(response_value), 
                    na.rm = TRUE),
        low  = quantile(as.numeric(response_value), 
                        probs = 0.1, 
                        na.rm = TRUE),
        high = quantile(as.numeric(response_value), 
                        probs = 0.9, 
                        na.rm = TRUE)
  )

covid_survey_summary_stats_by_group
# A tibble: 126 × 6
# Groups:   explanatory, explanatory_value [21]
   explanatory explanatory_value response        mean   low  high
   <chr>       <chr>             <chr>          <dbl> <dbl> <dbl>
 1 exp_age_bin 21-25             resp_concern_…  3.32     2     5
 2 exp_age_bin 21-25             resp_confiden…  1.31     1     2
 3 exp_age_bin 21-25             resp_feel_saf…  1.20     1     2
 4 exp_age_bin 21-25             resp_safety     1.95     1     5
 5 exp_age_bin 21-25             resp_trust_in…  1.29     1     2
 6 exp_age_bin 21-25             resp_will_rec…  1.09     1     1
 7 exp_age_bin 26-30             resp_concern_…  3.35     1     5
 8 exp_age_bin 26-30             resp_confiden…  1.40     1     2
 9 exp_age_bin 26-30             resp_feel_saf…  1.29     1     2
10 exp_age_bin 26-30             resp_safety     2.16     1     5
# ℹ 116 more rows
covid_survey_summary_stats_all <- covid_survey_longer |>
    group_by(response) |>
    summarize(
        mean = mean(as.numeric(response_value), 
                    na.rm = TRUE),
        low  = quantile(as.numeric(response_value), 
                        probs = 0.1, 
                        na.rm = TRUE),
        high = quantile(as.numeric(response_value), 
                        probs = 0.9, 
                        na.rm = TRUE),
        explanatory = "All",
        explanatory_value = ""

  )

covid_survey_summary_stats_all
# A tibble: 6 × 6
  response         mean   low  high explanatory explanatory_value
  <chr>           <dbl> <dbl> <dbl> <chr>       <chr>            
1 resp_concern_s…  3.28     1     5 All         ""               
2 resp_confidenc…  1.43     1     2 All         ""               
3 resp_feel_safe…  1.36     1     2 All         ""               
4 resp_safety      2.03     1     5 All         ""               
5 resp_trust_info  1.40     1     2 All         ""               
6 resp_will_reco…  1.21     1     2 All         ""               
covid_survey_summary_stats <- bind_rows(
    covid_survey_summary_stats_all,
    covid_survey_summary_stats_by_group
)
covid_survey_summary_stats
# A tibble: 132 × 6
   response        mean   low  high explanatory explanatory_value
   <chr>          <dbl> <dbl> <dbl> <chr>       <chr>            
 1 resp_concern_…  3.28     1     5 All         ""               
 2 resp_confiden…  1.43     1     2 All         ""               
 3 resp_feel_saf…  1.36     1     2 All         ""               
 4 resp_safety     2.03     1     5 All         ""               
 5 resp_trust_in…  1.40     1     2 All         ""               
 6 resp_will_rec…  1.21     1     2 All         ""               
 7 resp_concern_…  3.32     2     5 exp_age_bin "21-25"          
 8 resp_confiden…  1.31     1     2 exp_age_bin "21-25"          
 9 resp_feel_saf…  1.20     1     2 exp_age_bin "21-25"          
10 resp_safety     1.95     1     5 exp_age_bin "21-25"          
# ℹ 122 more rows
response_labels <- c(
    "resp_safety" = "Based on my understanding, I believe the vaccine is safe",
    "resp_feel_safe_at_work" = "Getting the vaccine will make me feel safer at work",
    "resp_concern_safety" = "I am concerned about the safety and side effects of the vaccine",
    "resp_confidence_science" = "I am confident in the scientific vetting process for the new COVID vaccines",
    "resp_trust_info" = "I trust the information that I have received about the vaccines",
    "resp_will_recommend" = "I will recommend the vaccine to family, friends, and community members"
)

explanatory_labels <- c(
    "All" = "All",
    "exp_age_bin" = "Age",
    "exp_gender" = "Gender",
    "exp_race" = "Race",
    "exp_ethnicity" = "Ethnicity",
    "exp_profession" = "Profession",
    "exp_already_vax" = "Had COVID vaccine",
    "exp_flu_vax" = "Had flu vaccine this year"
)

covid_survey_summary_stats <- covid_survey_summary_stats |>
    mutate(
        response = factor(response, 
                          levels = names(response_labels),
                          labels = unname(response_labels)),
        explanatory = factor(explanatory, 
                             levels = names(explanatory_labels),
                             labels = unname(explanatory_labels))
    ) 
ggplot(
    covid_survey_summary_stats, 
    aes(x = mean, 
        y = explanatory_value, 
        xmin = low, 
        xmax = high)
) +

geom_pointrange(size = 0.2) +
geom_errorbar(
    aes(xmin = low, 
        xmax = high), 
        size = 0.2, 
        width = 0.25
) +

facet_grid(
    explanatory ~ response, 
    labeller = labeller(
        explanatory = label_wrap_gen(15,
                                     multi_line = TRUE),
        response = label_wrap_gen(15, 
                                  multi_line = TRUE)
    ),
    space = "free_y",
    scales = "free_y"
) +

scale_x_continuous(breaks = 1:5, 
                   limits = c(1,5)) +

labs(x = "Mean Likert score (Error bars range from 10th to 90th percentile)",
     y = NULL) +

theme_minimal(base_size = 8) +

theme(
    strip.background = element_rect(fill = "gray90", 
                                    color = 'black', 
                                    size = 0.3),
    strip.text.y.right = element_text(angle = 0),
    panel.grid.major.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
) +

coord_cartesian(clip = "off")

4 - COVID survey - re-reconstruct

5 - COVID survey - another view